#Set up

#s1
rm(list=ls())

setwd ("C:/Users/cecidy/Google Drive/ZH_analysis/")

source("ZHfunctions.R")
options(scipen=9999)

ZeroHunger <- read.csv("Data/ZHdf/ZeroHungerDF_2020.csv",header=TRUE, sep=",")

##################################################################################################

#s2

###############################################################################################################

#start1_

#BF and Education (scool attendance) 00-10

#######################################################################################################################

#Make a nonBF ZHvariable!

ZeroHunger$ZHPerCapTotal00to10for2000Analysis1000NOBF <- with(ZeroHunger,ZHPerCapTotal00to10for2000Analysis1000-BFPerCapTotal00to10for2000Analysis1000)

#######################################################################################################################

#1. Prepare each dataframe

Poverty04 <- subset(ZeroHunger, PopDensity20002000Analysis<=150)

load(file = "Data/ForSamples/MPI00list.rda")
load(file = "Data/ForSamples/Basic00.rda")

RuralPoverty04 <- subset(Poverty04, select=c("BFPerCapTotal00to10for2000Analysis1000","ZHPerCapTotal00to10for2000Analysis1000NOBF",Basic00,MPI00List,
                                             "PercChildren7to14SchoolAttendance2000for2000Analysis",
                                             "PercChildren7to14SchoolAttendance2010for2000Analysis"))

RuralPoverty04 <- na.omit(RuralPoverty04)

load(file = "Data/ForSamples/EcludeForRCProb00.rda")

RuralPoverty04 <- subset(RuralPoverty04,(!(IBGECode7digit %in% EcludeForRCProb00)))

RuralPoverty04quality <- RuralPoverty04

#Make logged variables

#First make MPI with the correct 0s
test <- NrOfZeroALL(RuralPoverty04quality)#This gives all the vars with 0s
MPIadd <- test[[1]]
Otheradd <- test[[2]]

WithAdded <- lapply(MPIadd,function(y) AddConstant(RuralPoverty04quality,y,addName=""))
WithAdded <- data.frame(do.call("cbind",WithAdded))

RuralPoverty04quality[,MPIadd] <- WithAdded[,MPIadd]#just override

#Then need to make the MPI - here use the 00 function
RuralPoverty04quality <- MakeMPI00(RuralPoverty04quality)

#Then for the others I cbind them on, and because Ive not included any of previous noCero there no duplicates
RuralPoverty04quality <- cbind(RuralPoverty04quality,lapply(Otheradd,function(y) AddConstant(RuralPoverty04quality,y,addName="nocero")))#

#################################################################################################

#Log variables
OtheraddCero <- paste(Otheradd,"nocero",sep="")

cols.log <- c("BFPerCapTotal00to10for2000Analysis1000","ZHPerCapTotal00to10for2000Analysis1000NOBF","MPI2000for2000Analysis","MPI2010for2000Analysis",
              "GDPPerCapPublicrealN2000for2000Analysis1000",OtheraddCero,"RemoteMinPopFifty2000Analysis",
              "PopDensity20002000Analysis","SizeKm22000Analysis","MeanSlopefor2000analysis","MeanElevationfor2000analysis")
addlog <- function(x) log10(x)
Log10variables <- data.frame(sapply(RuralPoverty04quality[cols.log],addlog))
colnames(Log10variables) <- paste(colnames(Log10variables), "log10",sep="")

#Get all onto ZH
RuralPoverty04quality <- cbind(RuralPoverty04quality,Log10variables)

###############################################################################################################

#Remove those I dont need
Log10variables <- NULL
Poverty04 <- NULL
WithAdded <- NULL
ZeroHunger <- NULL

#############################################################################################

#Make scaled and logged ZH variables, and set contrasts (obs contrast is also set afresh in analysis script)

RuralPoverty04quality<- getDataFrame(RuralPoverty04quality,"contr.Sum",
                                     contrastVar1="stateName",contrastVar2="Biomefor2000Analysis",setRefToMostCommonLevel,
                                     BaseVar="BFPerCapTotal00to10for2000Analysis1000log10",
                                     MainName="BFPerCapTotal00to10for2000Analysis1000log10scaledMain")

#fin1_

###############################################################################################################


#start2_

#Prepare vectors of variables for regressions

###############################################################################################################

IndepVarLin <- "BFPerCapTotal00to10for2000Analysis1000log10scaledMain"#already scaled so dont need to do it again

intLin <- c("BFPerCapTotal00to10for2000Analysis1000log10scaledMain","stateName")
intLin <- paste(intLin,collapse="*")

Depvar <- "PercChildren7to14SchoolAttendance2010for2000Analysis"
DepVarLog <- "PercChildren7to14SchoolAttendance2010for2000Analysis"

baselineDepVar <- "scale(PercChildren7to14SchoolAttendance2000for2000Analysis)"

stateName <- "stateName"

xvars <- c("scale(MPI2000for2000Analysis)","scale(GDPPerCapPublicrealN2000for2000Analysis1000log10)",
           "scale(TotalKcal2000percapperdaynocerolog10)",
           "scale(TotalHectareCrops2000for2000Analysisnocerolog10)","scale(TotalHectarePasture2006for2000Analysisnocerolog10)",
           "scale(TotalHectareFarmsLess502006for2000Analysisnocerolog10)","scale(RemoteMinPopFifty2000Analysislog10)",
           "scale(TotRCpcNOPRONAF00to10for2000Analysis1000nocerolog10)",
           "scale(ChangeCummulativeSPEI98and00to08and10for2000Analysis)",
           "scale(MeanElevationfor2000analysislog10)","scale(MeanSlopefor2000analysislog10)",
           "scale(PopDensity20002000Analysislog10)","scale(SizeKm22000Analysislog10)","scale(ZHPerCapTotal00to10for2000Analysis1000NOBFlog10)",
           "Biomefor2000Analysis")

cbpsDepVar <- "BFPerCapTotal00to10for2000Analysis1000log10"#Keep this logged because I call the non-logged non-logged below!

cbpsbaselineDepVar <- "PercChildren7to14SchoolAttendance2000for2000Analysis"

cbpsVars <- c("MPI2000for2000Analysis","GDPPerCapPublicrealN2000for2000Analysis1000log10","TotalKcal2000percapperdaynocerolog10",
              "TotalHectareCrops2000for2000Analysisnocerolog10","TotalHectarePasture2006for2000Analysisnocerolog10",
              "TotalHectareFarmsLess502006for2000Analysisnocerolog10","RemoteMinPopFifty2000Analysislog10",
              "TotRCpcNOPRONAF00to10for2000Analysis1000nocerolog10",
              "ChangeCummulativeSPEI98and00to08and10for2000Analysis",
              "MeanElevationfor2000analysislog10","MeanSlopefor2000analysislog10",
              "PopDensity20002000Analysislog10","SizeKm22000Analysislog10","ZHPerCapTotal00to10for2000Analysis1000NOBFlog10",
              "stateName","Biomefor2000Analysis")


#fin2_

###############################################################################################################################
###############################################################################################################################

#Here source the above AND get in the election variable so that ready to make the weights and final model!

prepareFunc <- function(namefolders,script,yearsample,timeperiod,progr,biomvar){
  result=NULL
  {
    #1.Load in the df 
    script <- paste(namefolders,script,sep="")
    
    sourcePartial(script,startTag="#s1",endTag="#s2")
    sourcePartial(script,startTag="#start1_",endTag="#fin1_")
    sourcePartial(script,"#start2_","#fin2_")
    
    print(c(cbpsDepVar,cbpsbaselineDepVar))
    
    #2. Get on the election variable
    ElecDF <- read.csv(paste("Data/Elections/AllElectionPerMun_",yearsample,sep="",paste("sample_Fin.csv",sep="")),
                       header=TRUE, sep=",")
    
    ElecDFsub <- subset(ElecDF, select=c("codeibge",paste("PresElec_vs_",timeperiod,sep="",paste("_w_",progr,sep=""))))
    names(ElecDFsub)[1] <- "IBGECode7digit"
    
    #it has to be the same sample as rural!
    RuralPoverty04quality <- merge(x=RuralPoverty04quality,y=ElecDFsub,by="IBGECode7digit",all.x=T)
    
    #get sample information
    print(length(RuralPoverty04quality[,1]))
    elecvar <- length(names(RuralPoverty04quality))
    print(length(which(is.na(RuralPoverty04quality[,elecvar]))))
    RuralPoverty04quality <- RuralPoverty04quality[!is.na(RuralPoverty04quality[,elecvar]),]
    print(length(RuralPoverty04quality[,1]))
    
    #extra - set the correct reference state/biome
    #2. Get the reference levels (want this to be the ones with lowest)
    #2.1. Cannot have empty levels
    RuralPoverty04quality$stateName <- droplevels(RuralPoverty04quality$stateName)
    RuralPoverty04quality[,biomvar] <- droplevels(RuralPoverty04quality[,biomvar])
    
    #2.2. Find the category with least observations
    
    #State
    onetabS <- data.frame(table(RuralPoverty04quality[,"stateName"]))
    onetabS <- onetabS [order(onetabS$Freq),]
    onetabS$Var1 <- as.character(onetabS$Var1)
    
    smallestS <- onetabS[1,1]
    
    #Biome
    onetabB <- data.frame(table(RuralPoverty04quality[,biomvar]))
    onetabB <- onetabB [order(onetabB$Freq),]
    onetabB$Var1 <- as.character(onetabB$Var1)
    
    smallestB <- onetabB[1,1]
    
    #2.3. Set the smallest state/biome as the reference (the last one)
    
    #state Reference
    stateSeq <- levels(RuralPoverty04quality$stateName)
    stateSeqNoRef <- stateSeq [! stateSeq %in% smallestS]
    stateSeqFin <- c(stateSeqNoRef,smallestS)#this ensures that the smallest one is last!
    
    #biome Reference
    biomeSeq <- levels(RuralPoverty04quality[,biomvar])
    biomeSeqNoRef <- biomeSeq [! biomeSeq %in% smallestB]
    biomeSeqFin <- c(biomeSeqNoRef,smallestB)#this ensures that the smallest one is last!
    
    #set the level
    RuralPoverty04quality$stateName <- factor(RuralPoverty04quality$stateName, levels = stateSeqFin)
    RuralPoverty04quality[,biomvar] <- factor(RuralPoverty04quality[,biomvar], levels = biomeSeqFin)
    
    #2.4. Set the contrast
    contrasts(RuralPoverty04quality$stateName) <- "contr.Sum"
    contrasts(RuralPoverty04quality[,biomvar]) <- "contr.Sum"
    
    #Then it is ready to make weights and model!
    
    result <- RuralPoverty04quality
  }
}

RuralPoverty04quality <- prepareFunc("ProgrammeOutcome/BF/Education00to10/",
                                     "25.BFandEducation00.R","2000","2000to2010","BF",
                                     "Biomefor2000Analysis")
head(RuralPoverty04quality)

###############################################################################################################################
###############################################################################################################################

#Logging do not create a normally distributed variable

hist(RuralPoverty04quality$PercChildren7to14SchoolAttendance2000for2000Analysis)
hist(log10(RuralPoverty04quality$PercChildren7to14SchoolAttendance2000for2000Analysis))

#Rather use bestNormalize package to find a transformation that suits
library(bestNormalize)

#Make the transformation varible!
(BNobject <- bestNormalize(RuralPoverty04quality$PercChildren7to14SchoolAttendance2010for2000Analysis))
head(BNobject)

?bestNormalize
?orderNorm

#for baseline variable
(BNobjectbase <- bestNormalize(RuralPoverty04quality$PercChildren7to14SchoolAttendance2000for2000Analysis))

###############################################################################################

#Here is the final transformed education variable to use
RuralPoverty04quality <- cbind(RuralPoverty04quality,BNobject$x.t)
colnames(RuralPoverty04quality)[which(names(RuralPoverty04quality) == "BNobject$x.t")] <- "PercChildren7to14SchoolAttendance2010for2000Analysis_orderNorm"


RuralPoverty04quality <- cbind(RuralPoverty04quality,BNobjectbase$x.t)
colnames(RuralPoverty04quality)[which(names(RuralPoverty04quality) == "BNobjectbase$x.t")] <- "PercChildren7to14SchoolAttendance2000for2000Analysis_orderNorm"
head(RuralPoverty04quality)

hist(RuralPoverty04quality$PercChildren7to14SchoolAttendance2010for2000Analysis_orderNorm)
hist(RuralPoverty04quality$PercChildren7to14SchoolAttendance2000for2000Analysis_orderNorm)

#######################################################################################################

#Then update - prepare all vectors needed to make the weights and model!

#THese are new!
Depvar <- "PercChildren7to14SchoolAttendance2010for2000Analysis_orderNorm"
DepVarLog <- "PercChildren7to14SchoolAttendance2010for2000Analysis_orderNorm"

baselineDepVar <- "scale(PercChildren7to14SchoolAttendance2000for2000Analysis_orderNorm)"

cbpsbaselineDepVar <- "PercChildren7to14SchoolAttendance2000for2000Analysis_orderNorm"

######################################################################################################

#3.Make cbps formula, with election variable, and create a new formula
my.formula <- as.formula(reformulate(termlabels=c(cbpsbaselineDepVar,cbpsVars),response=cbpsDepVar))
addTerm <- paste("PresElec_vs_","2000to2010",sep="",paste("_w_","BF",sep=""))
newmodformula <- update(my.formula,paste("~. +", addTerm))#makes a new formula with the added variable

######################################################################################################

#4. Make the CBPS weight
fit2 <- CBPS(newmodformula, data=RuralPoverty04quality)
CBPStabletest <- CBPStable(fit2,"Education00")
#the improvement is good, use

#5. But I would want to be able to save this weight and name it something different
save(fit2, file = paste("ProgrammeOutcome/BF/Education00to10/","fit2fullcoreElectionW",sep="",paste(".rda",sep="")))


#########################################################################################################
#########################################################################################################

#Run regression models (do manually/without function)

cbpsweights <- fit2

FullList <- FullModListRedElectionScale("2000to2010","BF",Depvar,IndepVarLin, baselineDepVar,stateName, xvars,intLin)

##########################################################################################################


#Linear
my.formula <- FullList[[1]]
Modlinear <- lmrob( my.formula,weights = fit2$weights, data=RuralPoverty04quality,fast.s.large.n = Inf,max.it=500,k.max=2000)
summary(Modlinear)
#adj r2 = 

#State linear
my.formula <- FullList[[3]]
ModStatelinear <- lmrob( my.formula,weights = fit2$weights, data=RuralPoverty04quality,fast.s.large.n = Inf,max.it=500,k.max=2000)
summary(ModStatelinear)

summary(ModStatelinear)$adj.r-summary(Modlinear)$adj.r#State linear model is better
Anova(ModStatelinear, type="3")#yes Anova is significant

#So state linear model is the best model

################################################################################

#Save them

#I save the two
save(Modlinear, file = "ProgrammeOutcome/BF/Education00to10/ModlinearElection.rda")
save(ModStatelinear, file = "ProgrammeOutcome/BF/Education00to10/ModStatelinearElection.rda")

##################################################################################
